source("../modularity_analysis_functions.R", local = knitr::knit_global())# reading the microbiome data
# working only with Rattus from the three villages
# villages: Andatsakala, Mandena, Sarahandrano
data_asv <- read_csv("../../data/data_processed/microbiome/data_asv_rra0.001_p0.01_th5000_all.csv")# small mammals data
small_mammals <- read_csv("../../data/data_raw/data_small_mammals/Terrestrial_Mammals.csv") %>%
mutate(host_ID = as.numeric(gsub(".*?([0-9]+).*", "\\1", animal_id))) %>%
mutate(age_repro = as_factor(age_repro)) %>%
dplyr::rename(grid = habitat_type) %>%
filter(host_ID %in% data_asv$host_ID)
cat("## Abundance", '\n','\n')g <- small_mammals %>%
count(village,grid) %>%
ggplot(aes(x=grid, y=n)) +
geom_bar(position="dodge", stat="identity") +
theme_bw() +
theme(axis.text = element_text(size = 10, color = 'black', angle = 90, vjust = 0.5, hjust=1), title = element_text(size = 14)) +
labs(x="Land Use", y="Small-Mammals Abundance")
print(g)cat("## Sex ratio", '\n','\n')g <- small_mammals %>%
count(grid, sex) %>%
ggplot(aes(fill=sex, x=grid, y=n)) +
geom_bar(position="fill", stat="identity") +
geom_hline(yintercept = 0.5, linetype = "dashed") +
theme_bw() +
theme(axis.text = element_text(size = 10, color = 'black', angle = 90, vjust = 0.5, hjust=1), title = element_text(size = 14)) +
labs(x="Land Use", y="Proportion")
print(g)cat("## Age ratio", '\n','\n')g <- small_mammals %>%
count(grid, age_repro) %>%
ggplot(aes(fill=age_repro, x=grid, y=n)) +
geom_bar(position="fill", stat="identity") +
theme_bw() +
theme(axis.text = element_text(size = 10, color = 'black', angle = 90, vjust = 0.5, hjust=1), title = element_text(size = 14)) +
labs(x="Land Use", y="Proportion")
print(g)# relative abundance of the 10 most abundant Family of each group
# microbes taxonomy
tax <- read_delim("../../data/data_raw/data_microbiome/ASVs_taxonomy_new.tsv") %>%
dplyr::rename(asv_ID = ASV)
data_asv_tax <- data_asv %>%
left_join(tax, by="asv_ID")
cat("## Relative abundance Family", '\n','\n')total_reads_groups <- data_asv_tax %>% distinct(host_ID, asv_core, total_reads) %>% group_by(asv_core) %>% summarise(n_total = sum(total_reads))
most_abu_family <- data_asv_tax %>%
mutate(reads_a = reads*total_reads) %>%
group_by(asv_core, Family) %>%
summarise(n= sum(reads_a)) %>%
left_join(total_reads_groups, by="asv_core") %>%
mutate(n_p = n/n_total)
most_abu_family_8 <- most_abu_family %>%
filter(!is.na(Family)) %>%
ungroup() %>%
slice_max(by = asv_core, order_by = n_p, n = 6) %>%
mutate(p = "top") %>%
select(asv_core, Family, p)
most_abu_family_p <- data_asv_tax %>%
mutate(reads_a = reads*total_reads) %>%
group_by(asv_core, Family, grid) %>%
summarise(n= sum(reads_a)) %>%
left_join(total_reads_groups, by="asv_core") %>%
mutate(n_p = n/n_total) %>%
left_join(most_abu_family_8, by=c("asv_core","Family")) %>%
mutate(p = case_when(p=="top" ~ Family,
is.na(Family) ~ ".NA",
is.na(p) ~ ".Other"))
library("RColorBrewer")
colors <- c("grey40","grey80","darkblue","#ffd60a",brewer.pal(n = 12, name = "Paired"))
g <- most_abu_family_p %>%
group_by(asv_core,grid, p) %>%
summarise(n_p = sum(n_p)) %>%
ggplot(aes(fill=p, x=grid, y=n_p)) +
geom_bar(position="fill", stat="identity") +
facet_wrap(~asv_core) +
theme_bw() +
theme(axis.text = element_text(size = 10, color = 'black', angle = 90, vjust = 0.5, hjust=1), title = element_text(size = 14),
strip.text = element_text(size=12, color = 'black'),
panel.grid = element_blank(), panel.background = element_rect(colour = "black"), legend.key.size = unit(0.5, "cm")) +
scale_fill_manual(values=colors) +
labs(x="Land Use", y="Relative Abundance", fill = "Family")
print(g)cat('\n','\n')set.seed(123)
data_asv_mat <- data_asv %>%
#filter(asv_core == "Core") %>%
select(host_ID, asv_ID, reads) %>%
spread(asv_ID, reads, fill = 0) %>%
column_to_rownames("host_ID") %>%
as.matrix()
hosts <- small_mammals %>%
filter(host_ID %in% rownames(data_asv_mat)) %>%
select(host_ID,village,grid,season,month,mass, sex,age_repro,age_dental,elevation.obs) %>%
mutate(village=as.factor(village),grid=as.factor(grid), season=as.factor(season),month=as.factor(month), sex=as.factor(sex), age_dental=as.factor(age_dental))
distance_matrix <- vegdist(data_asv_mat, method = "bray")
# Perform PERMANOVA
permanova_result <- adonis2(distance_matrix ~ grid+season+village, data = hosts, permutations = 999)
print(knitr::kable(permanova_result[,4:5]))| F | Pr(>F) | |
|---|---|---|
| grid | 2.331399 | 0.001 |
| season | 4.768973 | 0.001 |
| village | 5.666093 | 0.001 |
| Residual | NA | NA |
| Total | NA | NA |
cat('\n')# PERMANOVA post-hoc
# perm_post_grid <- RVAideMemoire::pairwise.perm.manova(distance_matrix, fact = hosts$grid,
# test = "bonferroni", nperm = 999, progress = FALSE)$p.value %>% as.data.frame()
# print(knitr::kable(perm_post_grid))
# cat('\n')
#NMDS
nmds_result <- metaMDS(distance_matrix, distance = "bray", k = 2, trace = F)
# preparing data for plotting
nmds_plot <- nmds_result$points %>%
as.data.frame() %>%
rownames_to_column("host_ID") %>%
mutate(host_ID = as.double(host_ID)) %>%
left_join(hosts, by="host_ID")
# plotting
g1 <- nmds_plot %>%
ggplot( aes(MDS1, MDS2, color=grid, shape=village)) +
geom_point(size = 2, position=position_jitter(.01)) +
#stat_ellipse(aes(fill=grid), alpha=.1, type='norm',linetype =2, geom="polygon") + ##draws 95% confidence interval ellipses
theme_bw() +
annotate("text", x=min(nmds_plot$MDS1)+0.04, size=3, y=max(nmds_plot$MDS2), label=paste('Stress =',round(nmds_result$stress,3))) +
labs(x = "NMDS1", y = "NMDS2", title = "(A) Bray-Curtis")
print(g1)set.seed(123)
library(GUniFrac)
rooted_phylo_tree <- phangorn::midpoint(phylo_tree)
distance_matrix <- GUniFrac(data_asv_mat, rooted_phylo_tree, verbose = F)$unifracs[, , "d_UW"]
# Perform weighted UniFrac
unifrac_result <- adonis2(distance_matrix ~ grid+season+village, data = hosts, permutations = 999)
print(knitr::kable(unifrac_result[,4:5]))| F | Pr(>F) | |
|---|---|---|
| grid | 2.644989 | 0.001 |
| season | 6.105313 | 0.001 |
| village | 8.450150 | 0.001 |
| Residual | NA | NA |
| Total | NA | NA |
cat('\n')# PERMANOVA post-hoc
# perm_post_grid <- RVAideMemoire::pairwise.perm.manova(distance_matrix, fact = hosts$grid,
# test = "bonferroni", nperm = 999, progress = FALSE)$p.value %>% as.data.frame()
# print(knitr::kable(perm_post_grid))
# cat('\n')
#NMDS
nmds_result2 <- metaMDS(distance_matrix, k = 2, trace = F)
# preparing data for plotting
nmds_plot2 <- nmds_result2$points %>%
as.data.frame() %>%
rownames_to_column("host_ID") %>%
mutate(host_ID = as.double(host_ID)) %>%
left_join(hosts, by="host_ID")
# plotting
g2 <- nmds_plot2 %>%
ggplot( aes(MDS1, MDS2, color=grid, shape=season)) +
geom_point(size = 2, position=position_jitter(.05)) +
#stat_ellipse(aes(fill=grid), alpha=.1, type='norm',linetype =2, geom="polygon") + ##draws 95% confidence interval ellipses
theme_bw() +
annotate("text", x=min(nmds_plot2$MDS1)+0.15, size=3, y=max(nmds_plot2$MDS2), label=paste('Stress =',round(nmds_result2$stress,3))) +
labs(x = "NMDS1", y = "NMDS2", title = "(B) Weighted UniFrac")# plotting
g1 + g2 + plot_layout(guides='collect') &
theme(legend.position='bottom', legend.spacing.x=unit(0.1, 'cm'))cat('\n','\n')# taking only ASVs with known Genus
data_tax <- data_asv_tax %>%
distinct(host_ID, asv_core, Genus) %>%
filter(!is.na(Genus))
# known ASVs
asv_genus <- data_asv_tax %>%
distinct(asv_ID, Genus) %>%
mutate(name = is.na(Genus))
print(paste("Percentege of known ASVs:", round(table(asv_genus$name)[1]/nrow(asv_genus)*100, 2)))[1] “Percentege of known ASVs: 41.26”
cat('\n')# how many Genus?
print(paste("Number of genera:", length(unique(data_tax$Genus))))[1] “Number of genera: 119”
cat('\n')# making unique samples (host_ID+asv_core)
data_tax_meta <- data_tax %>%
arrange(host_ID) %>%
unite(col="sample", host_ID:asv_core, remove = FALSE)
data_tax_meta2 <- data_tax_meta %>% distinct(sample, asv_core)
data_tax_summary <- data_tax %>%
count(asv_core, Genus) %>%
spread(Genus, n, fill = 0)
# transforming to matrix
data_tax_mat <- data_tax_meta %>%
select(sample, Genus) %>%
mutate(n=1) %>%
spread(Genus, n, fill = 0) %>%
column_to_rownames("sample") %>%
as.matrix()
# Perform PERMANOVA
permanova_result <- adonis2(data_tax_mat ~ asv_core, data = data_tax_meta2, method = "jaccard", permutations = 999)
print(paste("F =", permanova_result$F[1]))[1] “F = 155.763265711577”
cat('\n')print(paste("p-value =", permanova_result$`Pr(>F)`[1]))[1] “p-value = 0.001”
cat('\n')# PCA
tax_pca <- prcomp(data_tax_mat)
# explained variance
ex_var <- tax_pca$sdev ^2
prop_ex_var <- ex_var/sum(ex_var)*100
loadings <- as.data.frame(tax_pca$rotation[, 1:2]) %>% rownames_to_column("Genus") %>%
mutate(PC1_abs = abs(PC1), PC2_abs = abs(PC2))
tax_scores <- as.data.frame(tax_pca$x[,1:2]) %>% rownames_to_column("sample") %>%
left_join(data_tax_meta2, by="sample")
loading_top <- loadings %>%
slice_max(n = 5, order_by = PC1_abs) %>%
bind_rows(loadings %>% slice_max(n = 5, order_by = PC2_abs))
g1 <- tax_scores %>%
ggplot(aes(PC1, PC2, color=asv_core)) +
geom_point() +
geom_segment(data = loading_top, inherit.aes = FALSE, aes(x = 0, y = 0, xend = PC1*6, yend = PC2*6),
arrow = arrow(length = unit(0.15, "cm"))) +
annotate("label", x = (loading_top$PC1*6), y = (loading_top$PC2*6),
label = loading_top$Genus, size=3, label.size=0.1, label.padding=unit(0.05, "lines")) +
theme_bw() +
theme(panel.border = element_rect(colour = "black", size=1))+
scale_fill_manual(values=group.colors) +
labs(x = paste("PC1 (",round(prop_ex_var[1],2),"%)", sep = ""), y = paste("PC2 (",round(prop_ex_var[2],2),"%)", sep = ""),
title = "(A) Genus", color = "ASV Group")# taking only ASVs with known Family
data_tax <- data_asv_tax %>%
distinct(host_ID, asv_core, Family) %>%
filter(!is.na(Family))
# known ASVs
asv_genus <- data_asv_tax %>%
distinct(asv_ID, Family) %>%
mutate(name = is.na(Family))
print(paste("Percentege of known ASVs:", round(table(asv_genus$name)[1]/nrow(asv_genus)*100, 2)))[1] “Percentege of known ASVs: 90.72”
cat('\n')# how many Family?
print(paste("Number of families:", length(unique(data_tax$Family))))[1] “Number of families: 55”
cat('\n')# making unique samples (host_ID+asv_core)
data_tax_meta <- data_tax %>%
arrange(host_ID) %>%
unite(col="sample", host_ID:asv_core, remove = FALSE)
data_tax_meta2 <- data_tax_meta %>% distinct(sample, asv_core)
data_tax_summary <- data_tax %>%
count(asv_core, Family) %>%
spread(Family, n, fill = 0)
# transforming to matrix
data_tax_mat <- data_tax_meta %>%
select(sample, Family) %>%
mutate(n=1) %>%
spread(Family, n, fill = 0) %>%
column_to_rownames("sample") %>%
as.matrix()
# Perform PERMANOVA
permanova_result <- adonis2(data_tax_mat ~ asv_core, data = data_tax_meta2, method = "jaccard", permutations = 999)
print(paste("F =", permanova_result$F[1]))[1] “F = 188.830216433384”
cat('\n')print(paste("p-value =", permanova_result$`Pr(>F)`[1]))[1] “p-value = 0.001”
cat('\n')# PCA
tax_pca <- prcomp(data_tax_mat)
# explained variance
ex_var <- tax_pca$sdev ^2
prop_ex_var <- ex_var/sum(ex_var)*100
loadings <- as.data.frame(tax_pca$rotation[, 1:2]) %>% rownames_to_column("Family") %>%
mutate(PC1_abs = abs(PC1), PC2_abs = abs(PC2))
tax_scores <- as.data.frame(tax_pca$x[,1:2]) %>% rownames_to_column("sample") %>%
left_join(data_tax_meta2, by="sample")
loading_top <- loadings %>%
slice_max(n = 5, order_by = PC1_abs) %>%
bind_rows(loadings %>% slice_max(n = 5, order_by = PC2_abs))
g2 <- tax_scores %>%
ggplot(aes(PC1, PC2, color=asv_core)) +
geom_point() +
geom_segment(data = loading_top, inherit.aes = FALSE, aes(x = 0, y = 0, xend = PC1*4, yend = PC2*4),
arrow = arrow(length = unit(0.15, "cm"))) +
annotate("label", x = (loading_top$PC1*4), y = (loading_top$PC2*4),
label = loading_top$Family, size=3, label.size=0.1, label.padding=unit(0.05, "lines")) +
scale_x_continuous(limits = c(-2, 2.7)) +
theme_bw() +
theme(panel.border = element_rect(colour = "black", size=1))+
scale_fill_manual(values=group.colors) +
labs(x = paste("PC1 (",round(prop_ex_var[1],2),"%)", sep = ""), y = paste("PC2 (",round(prop_ex_var[2],2),"%)", sep = ""),
title = "(B) Family", color = "ASV Group")# plotting
g1 + g2 + plot_layout(guides='collect') &
theme(legend.position='bottom')cat('\n','\n')# loop for three groups
for (i in 1:3) {
cat('##',core_names[i],'{.tabset}','\n','\n')
cat('### ASVs degree distribution','\n')
print(asv_degree_distribution_three_groups[[i]])
cat('\n','\n')
# calculating connectance
connectance_data <- modules_table_three_groups %>%
filter(asv_core == core_names[i])
cat('No. of hosts: ', length(unique(connectance_data$host_ID)) ,'\n','\n')
cat('No. of ASVs: ', length(unique(connectance_data$asv_ID)) ,'\n','\n')
cat('Connectance: ', nrow(connectance_data) / (length(unique(connectance_data$host_ID)) * length(unique(connectance_data$asv_ID))) ,'\n','\n')
cat('### Modules','\n')
cat('The color indicates number of host individuals in the module / total number of hosts in the whole grid [%]','\n','\n')
print(modules_three_groups[[i]])
cat('\n','\n')
cat('### Modules size','\n')
print(modules_size_three_groups[[i]])
cat('\n','\n')
cat('### No. of land uses','\n')
print(modules_grid_three_groups[[i]])
cat('\n','\n')
}No. of hosts: 853
No. of ASVs: 103
Connectance: 0.3439147
The color indicates number of host individuals in the module / total number of hosts in the whole grid [%]
No. of hosts: 855
No. of ASVs: 1188
Connectance: 0.06003308
The color indicates number of host individuals in the module / total number of hosts in the whole grid [%]
No. of hosts: 829
No. of ASVs: 660
Connectance: 0.0148006
The color indicates number of host individuals in the module / total number of hosts in the whole grid [%]
library(ggsignif)
#library(ggbreak)
modules_sizes <- modules_table_three_groups %>%
group_by(asv_core, host_group) %>%
summarise(n = n_distinct(host_ID))
# summary
cat('Summary','\n')Summary
modules_size_summary <- modules_sizes %>%
group_by(asv_core) %>%
summarise(mean = mean(n), sd = sd(n))
print(knitr::kable(modules_size_summary))| asv_core | mean | sd |
|---|---|---|
| Core | 77.545455 | 116.355802 |
| Non-core | 10.961538 | 45.098001 |
| Rare | 4.791907 | 2.617593 |
cat('\n')# ANOVA between groups
cat('ANOVA','\n')ANOVA
anova_result <- aov(n ~ asv_core, data = modules_sizes)
print(paste("F =", summary(anova_result)[[1]][["F value"]][1]))[1] “F = 24.2614554816964”
cat('\n')print(paste("p-value =", summary(anova_result)[[1]][["Pr(>F)"]][1]))[1] “p-value = 2.19880229088941e-10”
cat('\n')# Perform Tukey's HSD post hoc test
cat('Tukey post-hoc','\n')Tukey post-hoc
tukey_result <- TukeyHSD(anova_result)
# Extract the test statistics (q values) from the Tukey HSD results
tukey_q <- tukey_result$asv_core[, "diff"] / tukey_result$asv_core[, "lwr"]
tukey_result <- cbind(tukey_result$asv_core, q_value = tukey_q) %>%
as.data.frame() %>%
rownames_to_column("comp")
print(knitr::kable(tukey_result))| comp | diff | lwr | upr | p adj | q_value |
|---|---|---|---|---|---|
| Non-core-Core | -66.583916 | -92.12682 | -41.041011 | 0.0000000 | 0.7227419 |
| Rare-Core | -72.753547 | -97.41442 | -48.092674 | 0.0000000 | 0.7468458 |
| Rare-Non-core | -6.169631 | -16.98610 | 4.646836 | 0.3718674 | 0.3632165 |
cat('\n') g1 <- modules_sizes %>%
ggplot(aes(x=asv_core, y=n, fill=asv_core)) +
geom_boxplot(outliers = FALSE, alpha = 0.9) +
geom_jitter(color="black", size=1) +
theme_classic() +
#scale_y_continuous(limits = c(0, 145)) +
#scale_y_break(c(50,130), space=0.4, ticklabels = c(130,140))+
geom_signif(comparisons = list(c("Core", "Non-core"),c("Core", "Rare"),c("Non-core", "Rare")),
annotations = c("***","***","ns"), y_position = c(90,100,80), tip_length = 0) +
geom_point(data = modules_size_summary, aes(x=asv_core, y = mean), position = position_dodge(width = 0.75), size = 2, color="#d62828",alpha=0.8, show.legend=F) +
theme(axis.text = element_text(size = 11, color = 'black'), title = element_text(size = 15), legend.position = "none") +
scale_fill_manual(values=group.colors) +
labs(x="ASV Group", y="Module Size", title = "(A)")modules_grids <- modules_table_three_groups %>%
group_by(asv_core, host_group) %>%
summarise(n = n_distinct(grid))
# summary
cat('Summary','\n')Summary
modules_grids_summary <- modules_grids %>%
group_by(asv_core) %>%
summarise(mean = mean(n), sd = sd(n))
print(knitr::kable(modules_grids_summary))| asv_core | mean | sd |
|---|---|---|
| Core | 6.000000 | 1.264911 |
| Non-core | 2.910256 | 1.722169 |
| Rare | 3.028902 | 1.193138 |
cat('\n')# ANOVA between groups
cat('ANOVA','\n')ANOVA
anova_result <- aov(n ~ asv_core, data = modules_grids)
print(paste("F =", summary(anova_result)[[1]][["F value"]][1]))[1] “F = 25.439235906012”
cat('\n')print(paste("p-value =", summary(anova_result)[[1]][["Pr(>F)"]][1]))[1] “p-value = 8.18525908221536e-11”
cat('\n')# Perform Tukey's HSD post hoc test
cat('Tukey post-hoc','\n')Tukey post-hoc
tukey_result <- TukeyHSD(anova_result)
# Extract the test statistics (q values) from the Tukey HSD results
tukey_q <- tukey_result$asv_core[, "diff"] / tukey_result$asv_core[, "lwr"]
tukey_result <- cbind(tukey_result$asv_core, q_value = tukey_q) %>%
as.data.frame() %>%
rownames_to_column("comp")
print(knitr::kable(tukey_result))| comp | diff | lwr | upr | p adj | q_value |
|---|---|---|---|---|---|
| Non-core-Core | -3.0897436 | -4.1331795 | -2.0463077 | 0.0000000 | 0.7475464 |
| Rare-Core | -2.9710983 | -3.9785029 | -1.9636937 | 0.0000000 | 0.7467880 |
| Rare-Non-core | 0.1186453 | -0.3232108 | 0.5605015 | 0.8021089 | -0.3670834 |
cat('\n') g2 <- modules_grids %>%
ggplot(aes(x=asv_core, y=n, fill=asv_core)) +
geom_boxplot(outliers = FALSE, alpha = 0.9) +
geom_jitter(color="black", size=1) +
theme_classic() +
scale_y_continuous(limits = c(0, 9), breaks = seq(0,8,by=2)) +
geom_signif(comparisons = list(c("Core", "Non-core"),c("Core", "Rare"),c("Non-core", "Rare")),
annotations = c("***","***","ns"), y_position = c(7.7,8.5,7), tip_length = 0) +
geom_point(data = modules_grids_summary, aes(x=asv_core, y = mean), position = position_dodge(width = 0.75), size = 2, color="#d62828",alpha=0.8, show.legend=F) +
theme(axis.text = element_text(size = 11, color = 'black'), title = element_text(size = 15), legend.position = "none") +
scale_fill_manual(values=group.colors) +
labs(x="ASV Group", y="No. of Land Uses", title = "(B)")modules_per_grid <- modules_table_three_groups %>%
group_by(asv_core, grid) %>%
summarise(n = n_distinct(host_group))
# summary
cat('Summary','\n')Summary
modules_per_grid_summary <- modules_per_grid %>%
group_by(asv_core) %>%
summarise(mean = mean(n), sd = sd(n))
print(knitr::kable(modules_per_grid_summary))| asv_core | mean | sd |
|---|---|---|
| Core | 9.428571 | 1.618347 |
| Non-core | 32.428571 | 13.339879 |
| Rare | 74.857143 | 29.390637 |
cat('\n')# ANOVA between groups
cat('ANOVA','\n')ANOVA
anova_result <- aov(n ~ asv_core, data = modules_per_grid)
print(paste("F =", summary(anova_result)[[1]][["F value"]][1]))[1] “F = 22.152152106511”
cat('\n')print(paste("p-value =", summary(anova_result)[[1]][["Pr(>F)"]][1]))[1] “p-value = 1.40213584683407e-05”
cat('\n')# Perform Tukey's HSD post hoc test
cat('Tukey post-hoc','\n')Tukey post-hoc
tukey_result <- TukeyHSD(anova_result)
# Extract the test statistics (q values) from the Tukey HSD results
tukey_q <- tukey_result$asv_core[, "diff"] / tukey_result$asv_core[, "lwr"]
tukey_result <- cbind(tukey_result$asv_core, q_value = tukey_q) %>%
as.data.frame() %>%
rownames_to_column("comp")
print(knitr::kable(tukey_result))| comp | diff | lwr | upr | p adj | q_value |
|---|---|---|---|---|---|
| Non-core-Core | 23.00000 | -2.453251 | 48.45325 | 0.0805762 | -9.375316 |
| Rare-Core | 65.42857 | 39.975321 | 90.88182 | 0.0000105 | 1.636724 |
| Rare-Non-core | 42.42857 | 16.975321 | 67.88182 | 0.0013191 | 2.499427 |
cat('\n') g3 <- modules_per_grid %>%
ggplot(aes(x=asv_core, y=n, fill=asv_core)) +
geom_boxplot(outliers = FALSE, alpha = 0.9) +
geom_jitter(color="black", size=1) +
theme_classic() +
scale_y_continuous(limits = c(0, 60)) +
geom_signif(comparisons = list(c("Core", "Non-core"),c("Core", "Rare"),c("Non-core", "Rare")),
annotations = c("***","***","ns"), y_position = c(45,55,50), tip_length = 0) +
geom_point(data = modules_per_grid_summary, aes(x=asv_core, y = mean), position = position_dodge(width = 0.75), size = 2, color="#d62828",alpha=0.8, show.legend=F) +
theme(axis.text = element_text(size = 11, color = 'black'), title = element_text(size = 15), legend.position = "none") +
scale_fill_manual(values=group.colors) +
labs(x="ASV Group", y="No. of Modules per Land Use", title = "(C)")# plotting
g1+g2+g3 + plot_layout(axis_titles = "collect_x")# combining final results
assembly_final <- assembly_three_groups %>%
mutate(same_module = ifelse(host_group1==host_group2, "Same","Different"))
# renaming the process
assembly_final %<>% mutate(process = case_when(process=="Heterogeneous.Selection" ~ "Heterogeneous Selection",
process=="Homogeneous.Selection" ~ "Homogeneous Selection",
process=="Dispersal.Limitation" ~ "Dispersal Limitation",
process=="Homogenizing.Dispersal" ~ "Homogenizing Dispersal",
.default = "Drift"))
# summary
assembly_summary <- assembly_final %>%
count(asv_core, same_module, process)
assembly_summary_total <- assembly_summary %>%
group_by(asv_core) %>%
summarise(n_total = sum(n))
assembly_summary_total_module <- assembly_summary %>%
group_by(asv_core, same_module) %>%
summarise(n_total = sum(n))
#plotting process ratio between groups
g1 <- assembly_summary %>%
group_by(asv_core, process) %>%
summarise(n = sum(n)) %>%
left_join(assembly_summary_total, by=c("asv_core")) %>%
mutate(n_p = n/n_total) %>%
ggplot(aes(fill=process, x=asv_core, y=n_p)) +
geom_bar(position="fill", stat="identity") +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label = paste0(round(n_p*100,1),"%")),
position = position_stack(vjust = 0.5), size = 3)+
theme_bw() +
theme(axis.text = element_text(size = 11, color = 'black', angle = 90, vjust = 0.5, hjust=1), title = element_text(size = 14),
strip.text = element_text(size=12, color = 'black'), strip.background = element_rect(color = "grey80", size = 1),
panel.grid = element_blank(), panel.background = element_rect(colour = "black")) +
scale_fill_manual(values=c("#adc178","#d6ccc2","#f07167","#83c5be")) +
labs(x="ASVs Groups", y="Percentage")
print(g1)#plotting process ratio between modules
g2 <- assembly_summary %>%
left_join(assembly_summary_total_module, by=c("asv_core","same_module")) %>%
mutate(n_p = n/n_total) %>%
ggplot(aes(fill=process, x=same_module, y=n_p)) +
geom_bar(position="fill", stat="identity") +
facet_wrap(~asv_core) +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label = paste0(round(n_p*100,1),"%")),
position = position_stack(vjust = 0.5), size = 3)+
theme_bw() +
theme(axis.text = element_text(size = 11, color = 'black', angle = 90, vjust = 0.5, hjust=1), title = element_text(size = 14),
strip.text = element_text(size=12, color = 'black'), strip.background = element_rect(color = "grey80", size = 1),
panel.grid = element_blank(), panel.background = element_rect(colour = "black")) +
scale_fill_manual(values=c("#adc178","#d6ccc2","#f07167","#83c5be")) +
labs(x="", y="Percentage")
print(g2)# reading grid similarity results
grids_similarity_attr <- read_csv("../../data/data_processed/village_summary_new.csv")
rownames(grids_similarity_attr) <- grids_similarity_attr$grid_village
# correlations between variables
print(psych::pairs.panels(grids_similarity_attr %>% select(-grid_village,-village,-grid), ellipses = F, lm = F))NULL
cat('\n','\n')# RDA for modules
anova_model_all <- NULL
anova_rda_all <- NULL
score_modules_top_all <- NULL
rda_figs <- list()
# loop for three asv groups
for (i in 1:3) {
modules_similarity2 <- modules_similarity2_three_groups[[i]]
grid_names <- rownames(modules_similarity2)
# filtering matrix to the existing grids
grids_similarity_attr2 <- grids_similarity_attr %>% filter(grid_village %in% grid_names & !grepl("village", grid_village))
modules_similarity2 <- modules_similarity2[rownames(grids_similarity_attr2),]
a=data_asv %>% group_by(village,grid) %>% summarise(n=n_distinct(host_ID))
grids_similarity_attr2 %<>% left_join(a, by=c("village","grid"))
# tb-RDA
# hellinger transformation
modules_similarity2_hell <- decostand (modules_similarity2, 'hell')
rda_result <- rda(modules_similarity2 ~ veg_PC1+veg_PC2+dist_to_village+elevation+Condition(n), data=grids_similarity_attr2, na.action = na.omit)
# **Condition(n) - control for the number of host in each village-grid.
# variation explained
constrained_eig <- t(as.data.frame(rda_result$CCA$eig/rda_result$tot.chi*100)) # for RDAs
# unconstrained_eig <- rda_result$CA$eig/rda_result$tot.chi*100
# expl_var <- c(constrained_eig, unconstrained_eig)
# barplot (expl_var[1:20], col = c(rep ('red', length (constrained_eig)), rep ('black', length (unconstrained_eig))),
# las = 2, ylab = '% variation')
R2.obs <- RsquareAdj (rda_result)$adj.r.squared
# significance test for the whole model
anova_model <- anova(rda_result)[1,] %>%
mutate(R2.adj = R2.obs, asv_core = core_names[i])
anova_model_all <- rbind(anova_model_all, anova_model)
# significance test for all variables
anova_rda <- anova(rda_result, by = "margin", permutations = 999) %>%
mutate(asv_core = core_names[i])
anova_rda$`Pr(>F)` <- p.adjust (anova_rda$`Pr(>F)`, method = 'holm')
anova_rda_all <- rbind(anova_rda_all, anova_rda)
# scores
score_modules <- as.data.frame(vegan::scores(rda_result)$species) %>%
mutate(RDA1_abs = abs(RDA1), RDA2_abs = abs(RDA2))
score_modules_top <- score_modules %>%
slice_max(n = 5, order_by = RDA1_abs) %>%
bind_rows(score_modules %>% slice_max(n = 5, order_by = RDA2_abs)) %>%
rownames_to_column("host_group") %>%
mutate(asv_core = core_names[i])
score_modules_top_all <- rbind(score_modules_top_all, score_modules_top)
# plotting
# samples
rda_samples <- as.data.frame(scores(rda_result)$sites) %>%
rownames_to_column("grid_village") %>%
left_join(grids_similarity_attr2[c("grid_village","village","grid")], by="grid_village")
# variables
rda_var <- as.data.frame(rda_result$CCA$biplot)
# plot
g <- rda_samples %>%
ggplot(aes(RDA1, RDA2, color=grid, shape=village)) +
geom_hline(yintercept = 0, linetype = "dotted", color = "grey") +
geom_vline(xintercept = 0, linetype = "dotted", color = "grey") +
geom_point(size = 4, alpha=0.6) +
geom_segment(data = rda_var, inherit.aes = FALSE, aes(x = 0, y = 0, xend = (RDA1*0.85), yend = (RDA2*0.85)),
arrow = arrow(length = unit(0.15, "cm"))) +
geom_text(data = rda_var,inherit.aes = FALSE, aes(x = RDA1, y = RDA2, label=rownames(rda_var)))+
theme_bw() +
theme(panel.border = element_rect(colour = "black", size=1), panel.grid = element_blank())+
labs(x = paste("RDA1 (",round(constrained_eig[1],2),"%)", sep = ""), y = paste("RDA2 (",round(constrained_eig[2],2),"%)", sep = ""),
title = paste(core_names[i]), color = "Land Use", shape = "Village")
rda_figs <- append(rda_figs, list(g))
}
print(knitr::kable(anova_model_all))| Df | Variance | F | Pr(>F) | R2.adj | asv_core | |
|---|---|---|---|---|---|---|
| Model | 4 | 8.153774 | 1.069403 | 0.372 | 0.0114267 | Core |
| Model1 | 4 | 12.241210 | 1.458269 | 0.046 | 0.0976983 | Non-core |
| Model2 | 4 | 13.915078 | 1.163366 | 0.110 | 0.0392719 | Rare |
cat('\n','\n')print(knitr::kable(anova_rda_all))| Df | Variance | F | Pr(>F) | asv_core | |
|---|---|---|---|---|---|
| veg_PC1 | 1 | 0.5655093 | 0.2966759 | 1.000 | Core |
| veg_PC2 | 1 | 1.5189484 | 0.7968665 | 1.000 | Core |
| dist_to_village | 1 | 0.4230898 | 0.2219602 | 1.000 | Core |
| elevation | 1 | 4.4679061 | 2.3439405 | 0.112 | Core |
| Residual | 11 | 20.9676684 | NA | NA | Core |
| veg_PC11 | 1 | 1.4783178 | 0.7044354 | 1.000 | Non-core |
| veg_PC21 | 1 | 2.1010753 | 1.0011865 | 1.000 | Non-core |
| dist_to_village1 | 1 | 1.9840607 | 0.9454277 | 1.000 | Non-core |
| elevation1 | 1 | 7.0429163 | 3.3560304 | 0.004 | Non-core |
| Residual1 | 11 | 23.0844391 | NA | NA | Non-core |
| veg_PC12 | 1 | 2.3809244 | 0.7962259 | 1.000 | Rare |
| veg_PC22 | 1 | 3.7684380 | 1.2602365 | 0.336 | Rare |
| dist_to_village2 | 1 | 2.7102902 | 0.9063720 | 1.000 | Rare |
| elevation2 | 1 | 5.7427517 | 1.9204841 | 0.004 | Rare |
| Residual2 | 11 | 32.8928887 | NA | NA | Rare |
cat('\n','\n')rda_figs[[1]] + rda_figs[[2]] + rda_figs[[3]] + plot_layout(guides='collect') &
theme(legend.position='bottom')cat('\n','\n')library(caret)
library(yardstick)
library(randomForest)
library(ROSE)
# Assume df is your dataframe and "group" is the binary target column
df <- read_csv("../data/data_processed/ML_module/ML_rattus_three_villages_core.csv") %>%
select(-host_ID.x,-host_ID.y) %>%
mutate(module=as.factor(module), grid=as.factor(grid), sex=as.factor(sex), season=as.factor(season))
# Set seed for reproducibility
set.seed(123)
# Split the data into training and test sets
train_index <- createDataPartition(df$module, p = 0.8, list = FALSE)
train_df <- df[train_index, ]
test_df <- df[-train_index, ]
# Define the control for cross-validation
control <- trainControl(method = "cv", number = 5)
# Create a function for under-sampling
undersample <- function(df) {
# Separate features and target
features <- df[, -which(names(df) == "module")]
target <- df$module
# Combine into a new dataframe
data <- cbind(features, module = target)
# Perform under-sampling using ROSE package
balanced_data <- ovun.sample(module ~ ., data = data, method = "under", seed = 1)$data
return(balanced_data)
}
# Apply under-sampling to the training dataframe
balanced_train_df <- undersample(train_df)
# Separate features and target for the balanced training dataset
X_train <- balanced_train_df[, -which(names(balanced_train_df) == "module")]
y_train <- as.factor(balanced_train_df$module)
# Define the training model using random forest
rf_model <- train(X_train, y_train,
method = "rf",
trControl = control,
metric = "Accuracy")
# Print the model details
print(rf_model$finalModel)
# Evaluate the model on the test set
X_test <- test_df[, -which(names(test_df) == "module")]
y_test <- as.factor(test_df$module)
predictions_prob <- predict(rf_model$finalModel, X_test, type = "prob")[,2]
predictions_class <- predict(rf_model$finalModel, X_test)
# Convert predictions to a data frame for yardstick
results <- data.frame(
truth = y_test,
predicted_prob = predictions_prob,
predicted_class = predictions_class
)
# Calculate precision
precision_val <- yardstick::precision(results, truth = truth, estimate = predicted_class)
print(paste("Precision:", precision_val$.estimate))
# Calculate recall
recall_val <- yardstick::recall(results, truth = truth, estimate = predicted_class)
print(paste("Recall:", recall_val$.estimate))
# Calculate F1 score
f1_val <- yardstick::f_meas(results, truth = truth, estimate = predicted_class)
print(paste("F1 Score:", f1_val$.estimate))
# Plot ROC Curve
auc_roc_score <- yardstick::roc_auc(results, truth, predicted_prob)
roc_obj <- yardstick::roc_curve(results, truth, predicted_prob) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_path(color = "blue") +
geom_abline(lty = 3) +
coord_equal() +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))+
labs(title = "ROC Curve", subtitle = paste("AUC = ", round(auc_roc_score$.estimate,3)))
print(roc_obj)
cat('\n','\n')
# Plot Precision-Recall Curve
auc_pr_score <- yardstick::pr_auc(results, truth = truth, predicted_prob)
pr_obj <- yardstick::pr_curve(results, truth = truth, predicted_prob) %>%
ggplot(aes(x = recall, y = precision)) +
geom_path(color = "blue") +
coord_equal() +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))+
labs(title = "Precision-Recall curve", subtitle = paste("AUC = ", round(auc_pr_score$.estimate,3)))
print(pr_obj)
cat('\n','\n')
# Feature importance analysis
# Extract the final random forest model
final_rf <- rf_model$finalModel
# Calculate feature importance
importance <- importance(final_rf)
var_importance <- data.frame(Variables = rownames(importance),
Importance = importance[, 1])
# Plot feature importance
ggplot(var_importance, aes(x = reorder(Variables, Importance), y = Importance)) +
geom_bar(stat = "identity") +
coord_flip() +
ggtitle("Feature Importance") +
xlab("Variables") +
ylab("Importance (Mean Decrease in Gini)") +
theme_minimal()